---
title: "Replication code for 'A Fair price to pay: exploiting causal graphs for fairness in
insurance' "
author: "Olivier Côté, Marie-Pier Côté, and Arthur Charpentier (2024)"
output: 
  html_document: 
    code_folding: show
    toc: yes
    toc_float: yes
    highlight: pygments
    theme: cosmo
---

Replication code for "A Fair price to pay: exploiting causal graphs for fairness in
insurance" by Olivier Côté, Marie-Pier Côté, and Arthur Charpentier (2024). Specifically, this reproduces the simulated data from example 5.


```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
```

# Paquetage

```{r}
library(tidyverse)
library(latex2exp)
```

# Section 7

Simulations described in the beginning of section 7.

```{r}

p <- 0.5
n = 1e6

set.seed(321)

sims <- data.frame(D = rbinom(n, size = 1, prob = p),
                   eX = rnorm(n, mean = 1, sd = 3))

sims$gX <- 10 * (sims$D - p)
sims$X <- sims$eX + sims$gX
sims$eY <- rnorm(n, mean = 100 + 3 * sims$eX, sd = sqrt(30))
sims$gY <- 15 * (sims$D - p) + 3 * sims$gX
sims$Y <- sims$gY + sims$eY
                        
## Full df
df_full <- sims
```

Next, we code the score functions and apply it on a grid to illustrate.

```{r}
scores2 <- list("SB" = function(x, d, ex){100 + 3 * x + 15 * (d - p)},
  
               "SU" = function(x, d, ex){100 + 3 * x + 15 * (
  dnorm(x = x,
         mean = 1 + 10 * (1 - p),
         sd = 3)/(dnorm(x = x,
         mean = 1 + 10 * (1 - p),
         sd = 3) +  
      dnorm(x = x,
         mean = 1 + 10 * (0 - p),
         sd = 3)) - p) },
  
               "SA" = function(x, d, ex){100 + 3 * x},
  
               "SH" = function(x, d, ex){100 + 3 * x - 30 * (dnorm(x = x,
         mean = 1 + 10 * (1 - p),
         sd = 3)/(dnorm(x = x,
         mean = 1 + 10 * (1 - p),
         sd = 3) +  
      dnorm(x = x,
         mean = 1 + 10 * (0 - p),
         sd = 3)) - p)},

               "SC" = function(x, d, ex){100 + 3 * ex})



grid_to_test <- expand.grid(x = seq(-20, 20, by = 0.05),
                            d = c(0, 1))

grid_to_test$gx <- 10 * (grid_to_test$d - p)
grid_to_test$ex <- grid_to_test$x - grid_to_test$gx

levels_for_scores <- c("SB", "SU", "SA", "SH", "SC")
labels_for_scores <- c("$S^B(X, D)$","$S^U(X)$", "$S^A(X)$", "$S^H(X)$", "$S^C(\\epsilon_X)$")

df_to_g <- data.frame(grid_to_test, 
           levels_for_scores %>% 
  sapply(function(s){
    apply(grid_to_test, 1, function(l){scores2[[s]](x = l["x"],
                                                   d = l["d"],
                                                   ex = l["ex"])})
  }))



to_g <- df_to_g %>% 
  # mutate('SI' = SIX) %>% 
  # dplyr::select(-SIX, -gx) %>% 
  dplyr::select(-gx) %>%
  reshape2::melt(measure.vars = levels_for_scores) 

to_g$variable <- factor(to_g$variable, levels = levels_for_scores,
                  labels = labels_for_scores)


```

Here are the six individuals of Table 1.

```{r}
## Small df 
df_small <- data.frame("id" = 1:6,
                       "x" = c(-4, -4, 1, 1, 6, 6),
                 "d" = c(0, 1, 0, 1, 0 , 1))
df_small$gx <- 10 * (df_small$d - p)
df_small$ex <- df_small$x - df_small$gx

df_small$SB <- 100 + 3 * df_small$x + 15 * (df_small$d - p) 


df_small$p0 <- dnorm(x = df_small$x,
         mean = 1 + 10 * (0 - p),
         sd = 3)/(
           dnorm(x = df_small$x,
         mean = 1 + 10 * (1 - p),
         sd = 3) + 
      dnorm(x = df_small$x,
         mean = 1 + 10 * (0 - p),
         sd = 3))
df_small$p1 <- 1 - df_small$p0 
df_small$pd <- ifelse(df_small$d == 1, df_small$p1, df_small$p0)

df_small$SU <- 100 + 3 * df_small$x + 15 * (df_small$p1 - p) 
df_small$SA <- 100 + 3 * df_small$x 
df_small$SH <- 100 + 3 * df_small$x - 30 * (df_small$p1 - p)
df_small$SC <- 100 + 3 * df_small$ex 

##propensity

df_small_melted <- df_small %>% 
  reshape2::melt(measure.vars = levels_for_scores)


df_small_melted$variable <- factor(df_small_melted$variable, levels = levels_for_scores,
                  labels = labels_for_scores)
```

This produces Figure 9.

```{r figure9}
cols <- c(RColorBrewer::brewer.pal(8, "Dark2")[3:4])

## For multiple aestetics
scale_discrete_manual_ext <- function(aesthetics, values, name)
{
  lapply(aesthetics, function(aesthetic) {
    ggplot2::scale_discrete_manual(aesthetic, values = values[[aesthetic]], name = name)
  })
}

appender <- function(string) latex2exp::TeX(string) 


to_save_X <- 
  to_g %>%
  filter(x <= 10, x >= -10) %>% 
  ggplot(aes(x = x, y = value,
             col = factor(d),
             fill = factor(d),
             lty = factor(d),
             linewidth = factor(d),
             pch = factor(d))) +
  facet_grid(~variable, 
             labeller = as_labeller(appender, 
                            default = label_parsed,
                            multi_line = TRUE)) +
  geom_line(alpha = 0.5) +
  scale_color_manual(values = cols, name = "D") +
  scale_fill_manual(values = cols, name = "D") +
  scale_linetype_manual(values = c(1, 5), name = "D") +
  scale_discrete_manual_ext(c("pch", "linewidth"),
                            values = list(pch = c(3, 4),
                                          linewidth = c(2, 1)),
                            name = "D") +
  labs(x = "X", y = "Valeur de la prime") +
  scale_x_continuous(breaks = c(-8, -4, 0, 4, 8)) +
  scale_y_continuous(breaks = 7:13 * 10, limits = c(60, 140)) +
  theme_bw() +
  theme(legend.position = "bottom",
        legend.margin=margin(0,0,0,0),
        legend.box.margin=margin(-7,-7,-7,-7)) #+ 
  #   geom_point(data = df_small_melted,
  #            alpha = 0.9,
  #            size = 2,
  #            stroke = 1.75) +
  # ggrepel::geom_text_repel(data = df_small_melted,
  #           aes(label = id),
  #           size = 3, fill = "white",
  #           direction = "both",
  #           force = 35,
  #           nudge_x = 1.5,
  #           nudge_y = -2.5)

ggsave(filename = "Fig9.pdf",
       plot = to_save_X,
       height = 3.25,
       width = 8.55,
       units = "in",
       device = "pdf", dpi = 500)
```

This produces Figure 11.


```{r}
to_save_eX <- to_g %>% 
  filter(ex <= 15, ex >= -15) %>% 
  ggplot(aes(x = ex, y = value,
             col = factor(d),
             fill = factor(d),
             lty = factor(d),
             linewidth = factor(d),
             pch = factor(d))) +
  facet_grid(~variable, labeller = as_labeller(appender, 
                            default = label_parsed,
                            multi_line = TRUE)) +
  geom_line(alpha = 0.5) +
  scale_color_manual(values = cols, name = "D") +
  scale_fill_manual(values = cols, name = "D") +
  scale_linetype_manual(values = c(1, 5), name = "D") +
  scale_discrete_manual_ext(c("pch", "linewidth"),
                            values = list(pch = c(3, 4),
                                          linewidth = c(2, 1)),
                            name = "D") +
  labs(x = latex2exp::TeX("$\\epsilon_X$"), y = "Valeur de la prime") + 
  scale_x_continuous(breaks = c(-12, -6, 0, 6, 12)) + 
  scale_y_continuous(breaks = 4:10 * 15, limits = c(40, 160)) +
  theme_bw() + 
  theme(legend.position = "bottom",
        legend.margin=margin(0,0,0,0),
        legend.box.margin=margin(-7,-7,-7,-7)) #+ 
  #   geom_point(data = df_small_melted,
  #              alpha = 0.9,
  #            size = 2,
  #            stroke = 1.75) + 
  # ggrepel::geom_text_repel(data = df_small_melted,
  #           aes(label = id),
  #           size = 3, fill = "white",
  #           direction = "both",
  #           force = 20,
  #           nudge_x = 1.5,
  #           nudge_y = -5)

ggsave(filename = "Fig11.pdf",
       plot = to_save_eX,
       height = 3.25,
       width = 8.55,
       units = "in",
       device = "pdf", dpi = 500)

```

This produces Figure 10.

```{r}
to_g_dist_xd <- data.frame(grid_to_test, 
                           "prob" = sapply(1:nrow(grid_to_test), function(t){
  dnorm(grid_to_test[t, "x"], mean = 1 + 10 * (grid_to_test[t, "d"] - 0.5), sd = 3)/(
    dnorm(grid_to_test[t, "x"], mean = 1 + 10 * (0 - 0.5), sd = 3) + dnorm(grid_to_test[t, "x"], mean = 1 + 10 * (1 - 0.5), sd = 3)
  )
})) 

df_small2 <- data.frame(df_small, 
                           "prob" = sapply(1:nrow(df_small), function(t){
  dnorm(df_small[t, "x"], mean = 1 + 10 * (df_small[t, "d"] - 0.5), sd = 3)/(
    dnorm(df_small[t, "x"], mean = 1 + 10 * (0 - 0.5), sd = 3) + dnorm(df_small[t, "x"], mean = 1 + 10 * (1 - 0.5), sd = 3)
  )
})) 

to_save_xd <- to_g_dist_xd %>% 
  filter(x >= -8, x <= 10) %>% 
  ggplot(aes(x = x, y = prob,
             col = factor(d),
             fill = factor(d),
             lty = factor(d),
             linewidth = factor(d),
             pch = factor(d))) +
  geom_line(alpha = 0.5) +
  scale_color_manual(values = cols, name = "d") +
  scale_fill_manual(values = cols, name = "d") +
  scale_linetype_manual(values = c(1, 5), name = "d") +
  scale_discrete_manual_ext(c("pch", "linewidth"),
                            values = list(pch = c(3, 4),
                                          linewidth = c(1.7, 0.85)),
                            name = "d") +
  labs(x = latex2exp::TeX("$x$"), y = latex2exp::TeX("$Pr(D = d|X = x)$")) + 
  scale_x_continuous(breaks = c(-3:3)*3 + 1) + 
  theme_bw() #+ 
  #   geom_point(data = df_small2,
  #            alpha = 0.9,
  #            size = 1.5,
  #            stroke = 1.25) + 
  # ggrepel::geom_text_repel(data = df_small2,
  #           aes(label = id),
  #           size = 3, fill = "white",
  #           direction = "both", 
  #           force = 80,
  #           nudge_y = 0.001)

ggsave(filename = "Fig10.pdf",
       plot = to_save_xd,
       height = 2,
       width = 5,
       units = "in",
       device = "pdf", dpi = 500)
```

# Section 8

This produces Figure 13.

```{r}
to_g$dens <- dnorm(to_g$x, mean = 1 + 10 * (to_g$d - 0.5), sd = 3)
to_g <- to_g %>% group_by(d, variable) %>% 
  mutate(dens = dens/sum(dens))

to_g$dens %>%  sum


df_to_g_original <- data.frame(df_full, 
           levels_for_scores %>% 
  sapply(function(s){
    apply(df_full, 1, function(l){scores2[[s]](x = l["X"],
                                                   d = l["D"],
                                                   ex = l["eX"])})
  })) %>% reshape2::melt(measure.vars = levels_for_scores)

df_to_g <- df_to_g_original

df_to_g$variable <- factor(df_to_g$variable, levels = levels_for_scores,
                  labels = labels_for_scores)

df_to_g$D <- factor(df_to_g$D)

# df_to_g_agg <- df_to_g %>% 
#   group_by(D, variable) %>% 
#   summarise(n = str_pad(paste0('n = ', format(n(),
#                                       big.mark=" ",
#                                       trim=TRUE)),
#                         width = 13, side = "left"))

graph_independence <- df_to_g %>% 
  ggplot(aes(x = value,
             color = D,
             fill = D,
              lty = D)) + 
  facet_grid(rows = vars(variable),
             labeller = as_labeller(appender, 
                            default = label_parsed,
                            multi_line = TRUE)) + 
  scale_fill_manual(values = cols, name = "d") + 
  scale_color_manual(values = cols, name = "d") +
  scale_linetype_discrete(name = "d") +
  labs(x = "s", y = latex2exp::TeX("$\\hat{f}_{S|D}(s|d)$")) + 
  geom_density(alpha = 0.35, bw = 5, kernel = "gaussian", lwd = 0.8) + 
  scale_x_continuous(breaks = c(60, 80, 100, 120, 140),
                     limits = c(50, 153)) + 
  scale_y_continuous(breaks = c(0, 0.01, 0.02, 0.03)) +
  theme_minimal() + 
  theme(legend.position = "right", strip.text.y = element_text(size = 10)) #+
  # ggrepel::geom_label_repel(data = df_to_g_agg,
  #                          aes(color=D, 
  #                              label=n),
  #                          fill = 'white',
  #                          x=145,
  #                          y=0.045,
  #                          size = 2,
  #                          segment.colour = NA,
  #                          direction = 'y',
  #                          alpha=0.70,
  #                          force = 0.035,
  #                          label.padding = unit(0.10, "lines"),
  #                          fontface='bold',
  #                          inherit.aes = FALSE,
  #                          show.legend = FALSE)

ggsave(filename = "Fig13.pdf",
       plot = graph_independence,
       height = 6,
       width = 5,
       units = "in",
       device = "pdf", dpi = 500)

```

This produces Figure 14.

```{r}


put_labels <- function(groups, sorted_rr, round = 3, prefix = NULL){
  ## get unique values for groups
  values <- unique(groups)
  
  ## create labels according to relativities
  new_labels <- sapply(values, function(t){
    val <- sorted_rr[groups == t]
    
    if(sum(is.na(val)) > 0) return("NA") # for NA
    
    min_bracket <- round(min(val), round)
    max_bracket <- round(max(val), round)
    if(is.na(min_bracket)||is.na(max_bracket)){
      stop(paste0(c(t, min_bracket, max_bracket)))
    } else if((max_bracket - min_bracket) == 0){
      a <- paste0(min_bracket)
    } else {
      a <- paste0(min_bracket, prefix, max_bracket)
    }
    return(a)
  })
  ## Assign correct label to the correct level of the factor
  new_groups <- factor(groups)
  for (s in values) {
    levels(new_groups)[which(levels(new_groups) == s)] <- new_labels[s]
  }
  return(new_groups)
}
    

group_variable <- function(data,
                           variable,
                           exposure = NULL,
                           ngroups,
                           rounding = 3, 
                           prefix = NULL){
  
  ## We sort per value of the variable to group
  temp <- data[sort(data[[variable]], index.return = TRUE)$ix, ]
  
  ## depending on if exposure is specified, different treatment
  if(is.null(exposure)){
    expo_cumul <- 1:nrow(temp)
  } else {
    expo_cumul <- cumsum(temp[[exposure]])
  }
  
  cut_points <- nrow(temp) * (0:ngroups)/ngroups
  groups <- base::cut(x = expo_cumul,
                      include.lowest = TRUE,
                      breaks = cut_points,
                      labels = FALSE) %>%
        as.character %>%
        as.numeric
  
  new_groups <- put_labels(groups,
                           temp[[variable]],
                           round = rounding, 
                           prefix = prefix)
  temp[[paste0(variable, "_g")]] <- as.factor(new_groups)
  
  return(temp)
}

df_to_g_suff <- group_variable(data = df_to_g %>% 
                            filter(value <= 113, value >= 93),
                          variable = "value",
                          ngroups = 5,
                          rounding = 0,
                          prefix = " $\\leq S < $") 

# levels(df_to_g_suff$D) <- 0:1

df_to_g_suff_agg <- df_to_g_suff %>% 
  group_by(D, value_g, variable) %>% 
  summarise(n = str_pad(paste0('n = ', format(n(),
                                  big.mark=" ",
                                  trim=TRUE)),
                        width = 13,
                        side = 'both'))

graph_suff <- df_to_g_suff %>% 
  ggplot(aes(x = Y,
             color = D,
             fill = D,
              linetype = D)) + 
  facet_grid(value_g~variable,
             labeller = as_labeller(appender, 
                            default = label_parsed,
                            multi_line = TRUE)) + 
  scale_fill_manual(values = cols, name = "d") + 
  scale_color_manual(values = cols, name = "d") +
  scale_linetype_discrete(name = "d") +
  labs(x = "y", y = latex2exp::TeX("$\\hat{f}_{Y|D, S}(y|d, s)$")) + 
  geom_density(alpha = 0.4, kernel = "gaussian", bw = 5) + 
  theme_minimal() + 
  theme(strip.text.x = element_text(size = 10),
        strip.text.y = element_text(size = 8)) + 
  scale_x_continuous(breaks = c(60, 100, 145), 
                     limits = c(53, 153)) +
  scale_y_continuous(breaks = c(0, 0.02, 0.04),
                     limits = c(0, 0.06)) #+
 #  ggrepel::geom_label_repel(data = df_to_g_suff_agg %>% 
 #                              filter(D == 0),
 #                           aes(label=n),
 #                           color = cols[1],
 #                           fill = 'white',
 #                           x=60,
 #                           y=0.06,
 #                           size = 2,
 #                           segment.colour = NA,
 #                           direction = 'y',
 #                           alpha=0.70,
 #                           force = 0,
 #                           label.padding = unit(0.10, "lines"),
 #                           fontface='bold',
 #                           inherit.aes = FALSE,
 #                           show.legend = FALSE) + 
 # ggrepel::geom_label_repel(data = df_to_g_suff_agg %>% 
 #                              filter(D == 1),
 #                           aes(label=n),
 #                           color = cols[2],
 #                           fill = 'white',
 #                           x=155,
 #                           y=0.06,
 #                           size = 2,
 #                           segment.colour = NA,
 #                           direction = 'y',
 #                           alpha=0.70,
 #                           force = 0,
 #                           label.padding = unit(0.10, "lines"),
 #                           fontface='bold',
 #                           inherit.aes = FALSE,
 #                           show.legend = FALSE)

ggsave(filename = "Fig14.pdf",
       plot = graph_suff,
       height = 5.5,
       width = 8,
       units = "in",
       device = "pdf", dpi = 500)

```

This produces Figure 15.

```{r}
df_to_g_sep <- group_variable(data = df_to_g %>% 
                            filter(Y <= 113, Y >= 93),
                          variable = "Y",
                          ngroups = 5,
                          rounding = 0,
                          prefix = " $\\leq Y < $") 

df_to_g_sep_agg <- df_to_g_sep %>% 
  group_by(D, Y_g, variable) %>% 
  summarise(n = str_pad(paste0('n = ', format(n(),
                                      big.mark=" ",
                                      trim=TRUE)),
                        width = 13, side = "left"))

df_to_g_sep_agg[['n']][df_to_g_sep_agg[['variable']] != labels_for_scores[3]] <- rep(NA, length(df_to_g_sep_agg[['n']][df_to_g_sep_agg[['variable']] != labels_for_scores[3]]))


graph_separation <- df_to_g_sep %>% 
  ggplot(aes(x = value,
             color = D,
             fill = D,
              linetype = D)) + 
  facet_grid(Y_g~variable,
             labeller = as_labeller(appender, 
                            default = label_parsed,
                            multi_line = TRUE)) + 
  scale_fill_manual(values = cols, name = "d") + 
  scale_color_manual(values = cols, name = "d") +
  scale_linetype_discrete(name = "d") +
  labs(x = "s", y = latex2exp::TeX("$\\hat{f}_{S|D, Y}(s|d, y)$")) + 
  geom_density(alpha = 0.4, kernel = "gaussian", bw = 6) + 
  theme_minimal() +
  theme(strip.text.x = element_text(size = 10),
        strip.text.y = element_text(size = 8)) + 
  scale_x_continuous(breaks = c(60, 100, 140), 
                     limits = c(53, 153)) +
  scale_y_continuous(breaks = c(0, 0.02, 0.04)) #+
 #  ggrepel::geom_label_repel(data = df_to_g_sep_agg %>% 
 #                              filter(D == 0),
 #                           aes(label=n),
 #                           color = cols[1],
 #                           fill = 'white',
 #                           x=60,
 #                           y=0.06,
 #                           size = 2,
 #                           segment.colour = NA,
 #                           direction = 'y',
 #                           alpha=0.70,
 #                           force = 0,
 #                           label.padding = unit(0.10, "lines"),
 #                           fontface='bold',
 #                           inherit.aes = FALSE,
 #                           show.legend = FALSE) + 
 # ggrepel::geom_label_repel(data = df_to_g_sep_agg %>% 
 #                              filter(D == 1),
 #                           aes(label=n),
 #                           color = cols[2],
 #                           fill = 'white',
 #                           x=155,
 #                           y=0.06,
 #                           size = 2,
 #                           segment.colour = NA,
 #                           direction = 'y',
 #                           alpha=0.70,
 #                           force = 0,
 #                           label.padding = unit(0.10, "lines"),
 #                           fontface='bold',
 #                           inherit.aes = FALSE,
 #                           show.legend = FALSE)

 # caption = "Due to lack of overlap between groups D = 0 and 1 on the domain of the variable Y \n, the graph is produced on a subset of simulations where Y is in [75, 125], which corresponds to 60% of the data"

ggsave(filename = "Fig15.pdf",
       plot = graph_separation,
       height = 5.5,
       width = 8,
       units = "in",
       device = "pdf", dpi = 500)
```
